import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply matplotlib's "ggplot" style sheet to subsequent matplotlib figures.
plt.style.use("ggplot")
# Load the bacteria/antibiotic table; the path is relative to the working directory.
df = pd.read_csv("../HW2/Bacteria2.csv")
# Display the loaded frame.
df
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram |
|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative |
| 5 | Klebsiella pneumoniae | 850.000 | 1.20 | 1.000 | negative |
| 6 | Mycobacterium tuberculosis | 800.000 | 5.00 | 2.000 | negative |
| 7 | Proteus vulgaris | 3.000 | 0.10 | 0.100 | negative |
| 8 | Pseudomonas aeruginosa | 850.000 | 2.00 | 0.400 | negative |
| 9 | Salmonella (Eberthella) typhosa | 1.000 | 0.40 | 0.008 | negative |
| 10 | Salmonella schottmuelleri | 10.000 | 0.80 | 0.090 | negative |
| 11 | Staphylococcus albus | 0.007 | 0.10 | 0.001 | positive |
| 12 | Staphylococcus aureus | 0.030 | 0.03 | 0.001 | positive |
| 13 | Streptococcus fecalis | 1.000 | 1.00 | 0.100 | positive |
| 14 | Streptococcus hemolyticus | 0.001 | 14.00 | 10.000 | positive |
| 15 | Streptococcus viridans | 0.005 | 10.00 | 40.000 | positive |
# Show the column labels of the loaded table.
df.columns
Index(['Bacteria', 'Penicilin', 'Streptomycin', 'Neomycin', 'Gram'], dtype='object')
# Inspect the raw Penicilin column (the spelling follows the CSV header).
df.Penicilin
0     870.000
1       1.000
2       0.001
3       0.005
4     100.000
5     850.000
6     800.000
7       3.000
8     850.000
9       1.000
10     10.000
11      0.007
12      0.030
13      1.000
14      0.001
15      0.005
Name: Penicilin, dtype: float64
# Add a base-10 log column ("<name>L") for each antibiotic measurement;
# the raw values span several orders of magnitude.
for antibiotic in ("Penicilin", "Streptomycin", "Neomycin"):
    df[f"{antibiotic}L"] = np.log10(df[antibiotic])
df
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram | PenicilinL | StreptomycinL | NeomycinL |
|---|---|---|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative | 2.939519 | 0.000000 | 0.204120 |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative | 0.000000 | 0.301030 | -1.698970 |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive | -3.000000 | -2.000000 | -2.154902 |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive | -2.301030 | 1.041393 | 1.000000 |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative | 2.000000 | -0.397940 | -1.000000 |
| 5 | Klebsiella pneumoniae | 850.000 | 1.20 | 1.000 | negative | 2.929419 | 0.079181 | 0.000000 |
| 6 | Mycobacterium tuberculosis | 800.000 | 5.00 | 2.000 | negative | 2.903090 | 0.698970 | 0.301030 |
| 7 | Proteus vulgaris | 3.000 | 0.10 | 0.100 | negative | 0.477121 | -1.000000 | -1.000000 |
| 8 | Pseudomonas aeruginosa | 850.000 | 2.00 | 0.400 | negative | 2.929419 | 0.301030 | -0.397940 |
| 9 | Salmonella (Eberthella) typhosa | 1.000 | 0.40 | 0.008 | negative | 0.000000 | -0.397940 | -2.096910 |
| 10 | Salmonella schottmuelleri | 10.000 | 0.80 | 0.090 | negative | 1.000000 | -0.096910 | -1.045757 |
| 11 | Staphylococcus albus | 0.007 | 0.10 | 0.001 | positive | -2.154902 | -1.000000 | -3.000000 |
| 12 | Staphylococcus aureus | 0.030 | 0.03 | 0.001 | positive | -1.522879 | -1.522879 | -3.000000 |
| 13 | Streptococcus fecalis | 1.000 | 1.00 | 0.100 | positive | 0.000000 | 0.000000 | -1.000000 |
| 14 | Streptococcus hemolyticus | 0.001 | 14.00 | 10.000 | positive | -3.000000 | 1.146128 | 1.000000 |
| 15 | Streptococcus viridans | 0.005 | 10.00 | 40.000 | positive | -2.301030 | 1.000000 | 1.602060 |
from sklearn.manifold import MDS
# Keep only the three log-transformed columns as the input for the embedding.
logdf = df.loc[:, ["PenicilinL", "StreptomycinL", "NeomycinL"]]
logdf
| | PenicilinL | StreptomycinL | NeomycinL |
|---|---|---|---|
| 0 | 2.939519 | 0.000000 | 0.204120 |
| 1 | 0.000000 | 0.301030 | -1.698970 |
| 2 | -3.000000 | -2.000000 | -2.154902 |
| 3 | -2.301030 | 1.041393 | 1.000000 |
| 4 | 2.000000 | -0.397940 | -1.000000 |
| 5 | 2.929419 | 0.079181 | 0.000000 |
| 6 | 2.903090 | 0.698970 | 0.301030 |
| 7 | 0.477121 | -1.000000 | -1.000000 |
| 8 | 2.929419 | 0.301030 | -0.397940 |
| 9 | 0.000000 | -0.397940 | -2.096910 |
| 10 | 1.000000 | -0.096910 | -1.045757 |
| 11 | -2.154902 | -1.000000 | -3.000000 |
| 12 | -1.522879 | -1.522879 | -3.000000 |
| 13 | 0.000000 | 0.000000 | -1.000000 |
| 14 | -3.000000 | 1.146128 | 1.000000 |
| 15 | -2.301030 | 1.000000 | 1.602060 |
# Project the three log-scaled measurements onto two dimensions with
# multidimensional scaling (MDS).
# By default MDS starts from a random configuration, so a fixed random_state
# is passed to make the embedding reproducible across runs. Any output saved
# from an earlier, unseeded run will not match these coordinates.
df_MDS = MDS(n_components=2, random_state=0).fit_transform(logdf)
df_MDS
array([[-2.50140065, -1.77138718],
[ 0.01711131, 0.57776751],
[ 2.3777941 , 3.08819238],
[ 2.89605844, -1.29014142],
[-1.97173541, -0.28504887],
[-2.51786281, -1.61837219],
[-2.26187438, -2.19891479],
[-0.82293742, 0.60928886],
[-2.63552152, -1.31055685],
[-0.31132106, 1.2724154 ],
[-0.93461712, -0.15453806],
[ 1.34423962, 2.93766582],
[ 0.65608948, 3.01588131],
[ 0.04672363, 0.07080214],
[ 3.5832936 , -1.12704749],
[ 3.03596019, -1.81600658]])
# Sanity check: one row per bacterium, two embedding coordinates.
df_MDS.shape
(16, 2)
# Store the two embedding coordinates as columns next to the original data
# (transposing yields one row per coordinate, which unpacks into x and y).
df["MDS_x"], df["MDS_y"] = df_MDS.T
df
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram | PenicilinL | StreptomycinL | NeomycinL | MDS_x | MDS_y |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative | 2.939519 | 0.000000 | 0.204120 | -2.501401 | -1.771387 |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative | 0.000000 | 0.301030 | -1.698970 | 0.017111 | 0.577768 |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive | -3.000000 | -2.000000 | -2.154902 | 2.377794 | 3.088192 |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive | -2.301030 | 1.041393 | 1.000000 | 2.896058 | -1.290141 |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative | 2.000000 | -0.397940 | -1.000000 | -1.971735 | -0.285049 |
| 5 | Klebsiella pneumoniae | 850.000 | 1.20 | 1.000 | negative | 2.929419 | 0.079181 | 0.000000 | -2.517863 | -1.618372 |
| 6 | Mycobacterium tuberculosis | 800.000 | 5.00 | 2.000 | negative | 2.903090 | 0.698970 | 0.301030 | -2.261874 | -2.198915 |
| 7 | Proteus vulgaris | 3.000 | 0.10 | 0.100 | negative | 0.477121 | -1.000000 | -1.000000 | -0.822937 | 0.609289 |
| 8 | Pseudomonas aeruginosa | 850.000 | 2.00 | 0.400 | negative | 2.929419 | 0.301030 | -0.397940 | -2.635522 | -1.310557 |
| 9 | Salmonella (Eberthella) typhosa | 1.000 | 0.40 | 0.008 | negative | 0.000000 | -0.397940 | -2.096910 | -0.311321 | 1.272415 |
| 10 | Salmonella schottmuelleri | 10.000 | 0.80 | 0.090 | negative | 1.000000 | -0.096910 | -1.045757 | -0.934617 | -0.154538 |
| 11 | Staphylococcus albus | 0.007 | 0.10 | 0.001 | positive | -2.154902 | -1.000000 | -3.000000 | 1.344240 | 2.937666 |
| 12 | Staphylococcus aureus | 0.030 | 0.03 | 0.001 | positive | -1.522879 | -1.522879 | -3.000000 | 0.656089 | 3.015881 |
| 13 | Streptococcus fecalis | 1.000 | 1.00 | 0.100 | positive | 0.000000 | 0.000000 | -1.000000 | 0.046724 | 0.070802 |
| 14 | Streptococcus hemolyticus | 0.001 | 14.00 | 10.000 | positive | -3.000000 | 1.146128 | 1.000000 | 3.583294 | -1.127047 |
| 15 | Streptococcus viridans | 0.005 | 10.00 | 40.000 | positive | -2.301030 | 1.000000 | 1.602060 | 3.035960 | -1.816007 |
# Short display label for each bacterium name found in the Bacteria column.
bacteriaAbbreviations = {
    "Aerobacter aerogenes": "Aerobacter",
    "Brucella abortus": "Brucella ABO",
    "Brucella anthracis": "Brucella ANT",
    "Diplococcus pneumoniae": "Diplococcus",
    "Escherichia coli": "Escherichia",
    "Klebsiella pneumoniae": "Klebsiella",
    "Mycobacterium tuberculosis": "Mycobacterium",
    "Proteus vulgaris": "Proteus",
    "Pseudomonas aeruginosa": "Pseudomonas",
    "Salmonella (Eberthella) typhosa": "Sal. typhosa",
    "Salmonella schottmuelleri": "Sal. scho.",
    "Staphylococcus albus": "S. albus",
    "Staphylococcus aureus": "S. aureus",
    "Streptococcus fecalis": "S. fecalis",
    "Streptococcus hemolyticus": "S. hemolyticus",
    "Streptococcus viridans": "S. viridans",
}
# Look up the short label for every row (names missing from the dict become NaN).
df["bacteriaAbbr"] = df["Bacteria"].map(bacteriaAbbreviations)
df.head()
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram | PenicilinL | StreptomycinL | NeomycinL | MDS_x | MDS_y | bacteriaAbbr |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative | 2.939519 | 0.000000 | 0.204120 | -2.501401 | -1.771387 | Aerobacter |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative | 0.000000 | 0.301030 | -1.698970 | 0.017111 | 0.577768 | Brucella ABO |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive | -3.000000 | -2.000000 | -2.154902 | 2.377794 | 3.088192 | Brucella ANT |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive | -2.301030 | 1.041393 | 1.000000 | 2.896058 | -1.290141 | Diplococcus |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative | 2.000000 | -0.397940 | -1.000000 | -1.971735 | -0.285049 | Escherichia |
# Scatter plot of the 2-D MDS embedding, labelling each point with its
# abbreviated bacterium name.
fig = px.scatter(
    df,
    x="MDS_x",
    y="MDS_y",
    text="bacteriaAbbr",
    width=800,
    height=800,
)
fig
# For every bacterium, record which antibiotic column holds the smallest value.
# NOTE(review): presumably a lower value means a more effective drug — confirm
# against the data description.
df["BestAntibiotic"] = df.loc[:, ["Penicilin", "Streptomycin", "Neomycin"]].idxmin(axis="columns")
df
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram | PenicilinL | StreptomycinL | NeomycinL | MDS_x | MDS_y | bacteriaAbbr | BestAntibiotic |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative | 2.939519 | 0.000000 | 0.204120 | -2.501401 | -1.771387 | Aerobacter | Streptomycin |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative | 0.000000 | 0.301030 | -1.698970 | 0.017111 | 0.577768 | Brucella ABO | Neomycin |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive | -3.000000 | -2.000000 | -2.154902 | 2.377794 | 3.088192 | Brucella ANT | Penicilin |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive | -2.301030 | 1.041393 | 1.000000 | 2.896058 | -1.290141 | Diplococcus | Penicilin |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative | 2.000000 | -0.397940 | -1.000000 | -1.971735 | -0.285049 | Escherichia | Neomycin |
| 5 | Klebsiella pneumoniae | 850.000 | 1.20 | 1.000 | negative | 2.929419 | 0.079181 | 0.000000 | -2.517863 | -1.618372 | Klebsiella | Neomycin |
| 6 | Mycobacterium tuberculosis | 800.000 | 5.00 | 2.000 | negative | 2.903090 | 0.698970 | 0.301030 | -2.261874 | -2.198915 | Mycobacterium | Neomycin |
| 7 | Proteus vulgaris | 3.000 | 0.10 | 0.100 | negative | 0.477121 | -1.000000 | -1.000000 | -0.822937 | 0.609289 | Proteus | Streptomycin |
| 8 | Pseudomonas aeruginosa | 850.000 | 2.00 | 0.400 | negative | 2.929419 | 0.301030 | -0.397940 | -2.635522 | -1.310557 | Pseudomonas | Neomycin |
| 9 | Salmonella (Eberthella) typhosa | 1.000 | 0.40 | 0.008 | negative | 0.000000 | -0.397940 | -2.096910 | -0.311321 | 1.272415 | Sal. typhosa | Neomycin |
| 10 | Salmonella schottmuelleri | 10.000 | 0.80 | 0.090 | negative | 1.000000 | -0.096910 | -1.045757 | -0.934617 | -0.154538 | Sal. scho. | Neomycin |
| 11 | Staphylococcus albus | 0.007 | 0.10 | 0.001 | positive | -2.154902 | -1.000000 | -3.000000 | 1.344240 | 2.937666 | S. albus | Neomycin |
| 12 | Staphylococcus aureus | 0.030 | 0.03 | 0.001 | positive | -1.522879 | -1.522879 | -3.000000 | 0.656089 | 3.015881 | S. aureus | Neomycin |
| 13 | Streptococcus fecalis | 1.000 | 1.00 | 0.100 | positive | 0.000000 | 0.000000 | -1.000000 | 0.046724 | 0.070802 | S. fecalis | Neomycin |
| 14 | Streptococcus hemolyticus | 0.001 | 14.00 | 10.000 | positive | -3.000000 | 1.146128 | 1.000000 | 3.583294 | -1.127047 | S. hemolyticus | Penicilin |
| 15 | Streptococcus viridans | 0.005 | 10.00 | 40.000 | positive | -2.301030 | 1.000000 | 1.602060 | 3.035960 | -1.816007 | S. viridans | Penicilin |
# One-letter code for the best antibiotic: the first character of its name.
df["BestAntibioticAbbr"] = [name[0] for name in df["BestAntibiotic"]]
df.head()
| | Bacteria | Penicilin | Streptomycin | Neomycin | Gram | PenicilinL | StreptomycinL | NeomycinL | MDS_x | MDS_y | bacteriaAbbr | BestAntibiotic | BestAntibioticAbbr |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Aerobacter aerogenes | 870.000 | 1.00 | 1.600 | negative | 2.939519 | 0.000000 | 0.204120 | -2.501401 | -1.771387 | Aerobacter | Streptomycin | S |
| 1 | Brucella abortus | 1.000 | 2.00 | 0.020 | negative | 0.000000 | 0.301030 | -1.698970 | 0.017111 | 0.577768 | Brucella ABO | Neomycin | N |
| 2 | Brucella anthracis | 0.001 | 0.01 | 0.007 | positive | -3.000000 | -2.000000 | -2.154902 | 2.377794 | 3.088192 | Brucella ANT | Penicilin | P |
| 3 | Diplococcus pneumoniae | 0.005 | 11.00 | 10.000 | positive | -2.301030 | 1.041393 | 1.000000 | 2.896058 | -1.290141 | Diplococcus | Penicilin | P |
| 4 | Escherichia coli | 100.000 | 0.40 | 0.100 | negative | 2.000000 | -0.397940 | -1.000000 | -1.971735 | -0.285049 | Escherichia | Neomycin | N |
# Scatter plot of the MDS embedding, colored by the BestAntibiotic column and
# labelled with the abbreviated bacterium names.
fig = px.scatter(
    df,
    x="MDS_x",
    y="MDS_y",
    color="BestAntibiotic",
    text="bacteriaAbbr",
    width=800,
    height=800,
)


def add_trace_copy(trace):
    """Split *trace* into a markers-only trace plus a text-only companion.

    A copy of the trace is appended to the module-level ``fig`` and switched
    to text mode, with the label color taken from the original marker color
    and the labels placed above the points. The companion is hidden from the
    legend so each group is listed once. The original trace is then reduced
    to markers only.
    """
    fig.add_traces(trace)
    # The trace that was just appended is the last entry of fig.data.
    new_trace = fig.data[-1]
    new_trace.update(
        textfont_color=trace.marker.color,
        textposition="top center",
        mode="text",
        showlegend=False,
    )
    trace.update(mode="markers")


fig.for_each_trace(add_trace_copy)
fig.update_xaxes(
    range=[-4, 4],  # sets the range of xaxis
    constrain="domain",  # meanwhile compresses the xaxis by decreasing its "domain"
)
# Tie the y-axis scale to the x-axis (1:1) so distances in the embedding are
# not distorted by the plot's aspect ratio.
fig.update_yaxes(
    scaleanchor="x",
    scaleratio=1,
)
fig.show()